# predict housing prices
# -------------------------------------------------------------------- #
# prelims ####
# -------------------------------------------------------------------- #
# clear environment
rm(list = ls())

# packages
library(haven)
library(tidyverse)
library(rpart)	
library(rpart.plot)	
require(caTools)
library(rsample)
library(ranger)

# reproducibility
set.seed(123)

# paths
dir <- paste0("PATH_TO_MAIN_DIRECTORY_HERE")

# -------------------------------------------------------------------- #
# prepare data ####
# -------------------------------------------------------------------- #
## load data
# directory
ATC <- read_dta(paste0(dir, "data/int/ATC_predict.dta"))

# correct variables wrongly classified
ATC <- ATC %>%
  transform(
  CDIS=as.factor(CDIS),
  CUSEC=as.factor(CUSEC),
  BARRI=as.factor(BARRI),
  FirstSale=as.factor(FirstSale),
  RecentRef=as.factor(RecentRef)
)

# get variables of interest
ATC <- ATC %>% subset(select = -c(FloorMatch, PlotCode))
ATC <- ATC %>% subset(select = -c(CSEC))
ATC <- ATC %>% subset(select = -c(CUSEC))

# check variables class
sapply(ATC, class)

# check no missing values
colSums(is.na(ATC))

# -------------------------------------------------------------------- #
# prepare samples ####
# -------------------------------------------------------------------- #
vars <- character(0)
vars <- c(vars,names(ATC)[sapply(ATC,is.numeric)])
varslog <- vars[grepl(vars,pattern="^ln")] 
varsleveldrop <- gsub('^ln', '',varslog) 
varsleveldrop <- varsleveldrop[varsleveldrop %in% vars]

ATC$PRICE <- ATC$lnP 

# drop variables in levels
ATC <- ATC %>% dplyr::select(-c(all_of(varsleveldrop)))
ATC <- ATC %>% dplyr::select(-c(price, lnP))

# -------------------------------------------------------------------- #
# random forest ####
# -------------------------------------------------------------------- #
# split samples
split <- initial_split(ATC, prop = .8)
train <- training(split)
test  <- testing(split)

## Tunning -----------------------------------------------------------
## ------------------------------------------------------------------- ##

# variables to use in each split
mtry_def <- floor(length(ATC) / 3) 
mtry_l <- mtry_def - 4
mtry_h <- mtry_def + 6
node_grid <- seq(5, 9, by = 2)

# hyperparameter grid
hyper_grid <- expand.grid(
  mtry       = seq(mtry_l, mtry_h, by = 3),
  node_size  = node_grid,
  sample_size = seq(0.75, 0.80, by = 0.05),
  OOB_RMSE  = 0
)

# number of combinations
nrow(hyper_grid)

# print time
print(Sys.time())

# train model
for(i in 1:nrow(hyper_grid)) {
  
  model <- ranger(
    formula         = PRICE ~ ., 
    data            = train, 
    num.trees       = 500,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sample_size[i],
    seed            = 123,
    importance      = 'impurity'
  )
  
  # add OOB error to grid
  hyper_grid$OOB_RMSE[i] <- model$prediction.error
}

hyper_grid %>% 
  dplyr::arrange(OOB_RMSE) %>%
  head(10)

# get best parameters
best <- hyper_grid %>% 
  dplyr::arrange(OOB_RMSE) 
node_size_best <- best$node_size[1]
sample_size_best <- best$sample_size[1]
mtry_best <- best$mtry[1]

## Final tree --------------------------------------------------------
## ------------------------------------------------------------------- ##
system.time(
  ranger <- ranger(
    formula         = PRICE ~ ., 
    data            = train, 
    num.trees       = 500, 
    mtry            = mtry_best , 
    min.node.size   = node_size_best, 
    sample.fraction = sample_size_best,
    seed            = 123,
    importance      = 'impurity'
  )
)

# OOB Error
print(ranger$prediction.error)

# print time
print(Sys.time())

# store random forest
save(ranger,file = paste0(dir, "data/int/predict_price_ranger.RData"))

# -------------------------------------------------------------------- #
# predict prices ####
# -------------------------------------------------------------------- #
# load random forest
load(paste0(dir, "data/int/predict_price_ranger.RData"))

firstyear <- 2019
lastyear <- 2009

# predict housing prices
for(yr in firstyear:lastyear){

  # load catastro data
  DataName <- paste0(dir, "data/temp/catdata_predict_", yr, ".dta")
  CatData <- read_dta(DataName)

  # reclassify vars
  CatData <- CatData %>% 
    transform(
    CDIS=as.factor(CDIS),
    CSEC=as.factor(CSEC),
    CUSEC=as.factor(CUSEC),
    BARRI=as.factor(BARRI),
    FirstSale=as.factor(FirstSale),
    RecentRef=as.factor(RecentRef)
  )
  
  # keep existing plots
  RestData <- CatData[CatData$YearConst <= yr, ]
  RestData$year <- yr
  
  # predict prices
  pred_Catastro <- predict(ranger, RestData)
  
  PHat <- as.data.frame(pred_Catastro$predictions)
  
  Prices <- CatData %>% dplyr::select(c(PlotCode, PropOrder, PropOrder2))
  Prices <- cbind(Prices, PHat)

  Prices$PHat <- exp(Prices$`pred_Catastro$predictions`)
  
  Prices <- Prices %>% dplyr::select(-c(`pred_Catastro$predictions`))
  
  # store
  filename_R <- paste0(dir, "data/temp/PredPriceForest_08_900_", yr, ".RData")
  save(Prices, file = filename_R)

}

# append data
for(yr in firstyear:lastyear){
  
  # load
  filename_R <- paste0(dir, "data/temp/PredPriceForest_08_900_", yr, ".RData")
  load(filename_R)
  
  # rename var
  newname <- paste("PHat", yr, sep = "")
  names(Prices)[names(Prices) == "PHat"] <- newname
  
  # append
  if(yr == firstyear){
    PriceYrs <- Prices
  }else{
    PriceYrs <- merge(x = PriceYrs, y = Prices, 
                       by = c("PlotCode","PropOrder", "PropOrder2"),
                       all.x = TRUE)
  }
  
  # store
  write_dta(data = PriceYrs, paste0(dir, "data/int/PredPriceForest_08_900.dta"))
  
}

# -------------------------------------------------------------------- #
# compare predicted vs actual price in ATC data ####
# -------------------------------------------------------------------- #
# load random forest
load(paste0(dir, "data/int/predict_price_ranger.RData"))

# load ATC data
ATC <- read_dta(paste0(dir, "data/int/ATC_predict.dta"))

# correct variables wrongly classified
ATC <- ATC %>%
  transform(
    CDIS=as.factor(CDIS),
    CUSEC=as.factor(CUSEC),
    BARRI=as.factor(BARRI),
    FirstSale=as.factor(FirstSale),
    RecentRef=as.factor(RecentRef)
  )

# predict prices
pred_ATC <- predict(ranger, ATC)

PHat <- as.data.frame(pred_ATC$predictions)

Prices <- ATC %>% dplyr::select(c(PlotCode, price, year, CUSEC, BARRI,CDIS))
Prices <- cbind(Prices, PHat)

Prices$PHat <- exp(Prices$`pred_ATC$predictions`)

Prices <- Prices %>% dplyr::select(-c(`pred_ATC$predictions`))

# store
filename <- paste0(dir, "data/int/PredPriceForest_ATC")
write_dta(data = Prices, paste0(filename,".dta"))

# -------------------------------------------------------------------- #
# closing ####
# -------------------------------------------------------------------- #
rm(list = ls())
